library(qtlcharts)
library(CountClust)
library(parallel)
library(cellcycleR)
library(data.table)
library(binhf)
library(vioplot)
library(limma)
library(readxl)

Background

In this script, we perform the cellcycleR method on the single cell data collected by John Marioni’s group (see the data here). For the paper with the analysis of this data, see Buttner et al 2015. The experimental description as provided by the authors is given below

Experimental Description (by Marioni’s group)

Data Description

We first load the data and then explore the data

setwd('/Users/kushal/Documents/singleCell-method/project/analysis')
G1_single <- data.frame(fread('../data/Marioni_data/G1_singlecells_counts.txt'), row.names=1);
G2M_single <- data.frame(fread('../data/Marioni_data/G2M_singlecells_counts.txt'), row.names=1);
S_single <- data.frame(fread('../data/Marioni_data/S_singlecells_counts.txt'), row.names=1);

cell_phases <- c(rep("G1", 96), rep("S", 96), rep("G2M", 96))

We filter out the ERCC spike-in controls

ercc_start <- grep("ERCC", rownames(G1_single))[1]

G1_single <- G1_single[-(ercc_start:dim(G1_single)[1]),-(1:3)];

dim(G1_single)
## [1] 38293    96
G2M_single <- G2M_single[-(ercc_start:dim(G2M_single)[1]),-(1:3)];

dim(G2M_single)
## [1] 38293    96
S_single <- S_single[-(ercc_start:dim(S_single)[1]),-(1:3)];

dim(S_single)
## [1] 38293    96

Note that there are dim(S_single)[2] cells in each of the three phases. All the cells from one phase were then sequenced in one plate. So, we are likely to observe plate effects which could be confounded with the genetic effects. Pooling the data from G1, G2 and S phases.

pooled_data <- t(cbind(G1_single, S_single, G2M_single));

Filtering cell cycle genes

Next we filter out the cell cycle genes for mouse (Source: ??).

cell_cycle_genes <- as.vector(as.matrix((read_excel('../data/Marioni_data/cellcycle_genes_mouse.xlsx'))))

matched_indices <- match(cell_cycle_genes,colnames(pooled_data))
cycle_counts_data <- pooled_data[,matched_indices];
cycle_counts_data <- cycle_counts_data[, -which(colSums(cycle_counts_data)==0)]

dim(cycle_counts_data)
## [1] 288 886

Next for each gene, we adjust for the mean and scale by the standard deviation. Then we filter out all the genes that have 0 expression across all the cells.

cycle_voom_data <- voom(cycle_counts_data)$E;
cycle_data_norm <- apply(cycle_voom_data,2,function(x)  return (x-mean(x))/sd(x))
celltime_levels <- 100;
cycle_data_norm <- cycle_data_norm[, -which(colSums(cycle_data_norm)==0)]

dim(cycle_data_norm)
## [1] 288 883

cellcyleR on Marioni data

out <- cell_ordering_class(cycle_data_norm, celltime_levels = 100, num_iter=100, 
                             save_path="../rdas/cell_order_marioni.rda")

We ran the method above once already (took around 30 minutes) and now we just load the output.

out <- get(load(file="../rdas/cell_order_marioni.rda"));
cell_order_full <- cell_ordering_full(out$signal_intensity, dim(cycle_data_norm)[2])

We look at the plots of the amplitudes, phases and the non signal variation of the genes.

amp_genes <- out$amp;
sd_genes <- out$sigma;
phi_genes <- out$phi;

plot(density(phi_genes), col="red", main="Density plot of the phases")

plot(density(amp_genes[-which.max(amp_genes)]), col="red", main="Density plot of the amplitudes")

plot(density(sd_genes[-which.max(sd_genes)]), col="red", main="Density plot of the non-signal sd")

We extract the genes with high SNR - these are more likely to be sinusoidal.

ESS <- amp_genes^2; RSS <- sd_genes^2

SNR <- ESS/RSS;

plot(SNR, col="red", pch=20, lwd=1)

top_genes <- which(SNR > 3);

Next we plot the qtlcharts for these top sinusoidal genes and see if their patterns are indeed sinusoidal or not.

iplotCurves(t(cycle_data_norm[order(cell_order_full),top_genes]))
## Set screen size to height=700 x width=1000

We also provide the violin plot of the cell orders estimated against the labels for G1, S and G2 obtained from the experimenters using FUCCI. There are 96 cells from G1, S and G2. So, we perform qtlcharts keeping the relative order of the sample and see how the patterns look like (since numbers for three phases are same - 96, you can view it as uniformly split).

vioplot(cell_order_full[which(cell_phases=="G1")],
        cell_order_full[which(cell_phases=="S")],
        cell_order_full[which(cell_phases=="G2M")],
        names=c("G1","S","G2M"),
        col="red")

iplotCurves(t(cycle_data_norm[c(which(cell_phases=="G1"),which(cell_phases=="S"), which(cell_phases=="G2M")),top_genes]))

Redoing on high SNR genes

We extract the high SNR genes as they seem to have sinusoidal patterns and repeat the procedure again. The aim here is to see how much robust the analysis is to the selction of the (most) sinusoidal genes.

snr_high_indices <- which(SNR > 1);

cycle_data_norm_sinusoidal <- cycle_data_norm[,snr_high_indices];

dim(cycle_data_norm_sinusoidal)
## [1] 288 647

We apply the cell ordering mechanism (it takes around 5 minutes to run)

out2 <- cell_ordering_class(cycle_data_norm_sinusoidal, celltime_levels = 100, num_iter=100,
                            save_path="../rdas/cell_order_marioni_sinusoidal.rda")

We reload the output

out2 <- get(load(file="../rdas/cell_order_marioni_sinusoidal.rda"));
cell_order_full <- cell_ordering_full(out2$signal_intensity, dim(cycle_data_norm_sinusoidal)[2])

We plot the same features as above and check for the robustness. We needed to shift the cell order so as to compare with previous plot on all genes as the method is non-identifiable upto a rotation.

amp_genes <- out2$amp;
sd_genes <- out2$sigma;
phi_genes <- out2$phi;

plot(density(phi_genes), col="red", main="Density plot of the phases")

plot(density(amp_genes[-which.max(amp_genes)]), col="red", main="Density plot of the amplitudes")

plot(density(sd_genes[-which.max(sd_genes)]), col="red", main="Density plot of the non-signal sd")

ESS <- amp_genes^2; RSS <- sd_genes^2

SNR <- ESS/RSS;

plot(SNR, col="red", pch=20, lwd=1)

top_genes <- which(SNR > 0.5);
new_cell_order <- shift(order(cell_order_full),40,dir="right")
iplotCurves(t(cycle_data_norm_sinusoidal[new_cell_order,top_genes]))

Final Thoughts

While the method is indeed trying to give us sinusoidal patterns, but it is over representing the noisy or outlying cells, we did not scale the expression relative to cells (which I do not know would be recommended here), but seems like the cells with generally very high or low expression are forming the peaks or the troughs of the sinusoids, which probably is not what we want. But still though it seems the method is not doing what is expected, I kind of fancy the fact the method is doing a good job at trying to pool them together to form a sinusoid. Seems to me we only need to fix these rogue cells to get a more sinusoidal pattern. But should we just remove them? or scale them? are we losing info keeping these cells out?

sessionInfo()
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-apple-darwin14.3.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] readxl_0.1.0.9000  limma_3.24.15      vioplot_0.2       
##  [4] sm_2.2-5.4         binhf_1.0-1        adlift_1.3-2      
##  [7] EbayesThresh_1.3.2 wavethresh_4.6.6   MASS_7.3-40       
## [10] data.table_1.9.4   cellcycleR_0.1.0   CountClust_0.1.0  
## [13] qtlcharts_0.5-13  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.1      rstudioapi_0.3.1 knitr_1.11       magrittr_1.5    
##  [5] stringr_1.0.0    plyr_1.8.3       tools_3.2.0      htmltools_0.2.6 
##  [9] yaml_2.1.13      digest_0.6.8     reshape2_1.4.1   formatR_1.2.1   
## [13] htmlwidgets_0.4  evaluate_0.8     rmarkdown_0.7    stringi_0.5-5   
## [17] jsonlite_0.9.16  chron_2.3-47